In [ ]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
file_path = '/Users/lakshay/Desktop/graucia/loblaws.xlsx'
df = pd.read_excel(file_path, sheet_name='E Comm')  # Load the E Comm sheet
In [ ]:
# Display the first few rows of the dataframe to understand its structure
print(df.head())
print(df.columns)

# Data Cleaning
# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)  # Fill numerical NaNs with median
df.fillna(df.mode().iloc[0], inplace=True)  # Fill categorical NaNs with mode
   CustomerID  Churn  Tenure PreferredLoginDevice  CityTier  WarehouseToHome  \
0       50001      1     4.0         Mobile Phone         3              6.0   
1       50002      1     NaN                Phone         1              8.0   
2       50003      1     NaN                Phone         1             30.0   
3       50004      1     0.0                Phone         3             15.0   
4       50005      1     0.0                Phone         1             12.0   

  PreferredPaymentMode  Gender  HourSpendOnApp  NumberOfDeviceRegistered  \
0           Debit Card  Female             3.0                         3   
1                  UPI    Male             3.0                         4   
2           Debit Card    Male             2.0                         4   
3           Debit Card    Male             2.0                         4   
4                   CC    Male             NaN                         3   

     PreferedOrderCat  SatisfactionScore MaritalStatus  NumberOfAddress  \
0  Laptop & Accessory                  2        Single                9   
1              Mobile                  3        Single                7   
2              Mobile                  3        Single                6   
3  Laptop & Accessory                  5        Single                8   
4              Mobile                  5        Single                3   

   Complain  OrderAmountHikeFromlastYear  CouponUsed  OrderCount  \
0         1                         11.0         1.0         1.0   
1         1                         15.0         0.0         1.0   
2         1                         14.0         0.0         1.0   
3         0                         23.0         0.0         1.0   
4         0                         11.0         1.0         1.0   

   DaySinceLastOrder  CashbackAmount  
0                5.0          159.93  
1                0.0          120.90  
2                3.0          120.28  
3                3.0          134.07  
4                3.0          129.60  
Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
       'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
       'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount'],
      dtype='object')
In [ ]:
# Ensure Churn is treated as a categorical variable
df['Churn'] = df['Churn'].astype(int)

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

# Encode categorical columns
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Identify numerical columns
numerical_columns = df.select_dtypes(include=[np.number]).columns
print("Numerical columns:", numerical_columns)
Categorical columns: Index(['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender',
       'PreferedOrderCat', 'MaritalStatus'],
      dtype='object')
Numerical columns: Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
       'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
       'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount'],
      dtype='object')
In [ ]:
# Descriptive Statistics
print(df.describe())

# Churn Distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='Churn', data=df)
plt.title('Churn Distribution')
plt.show()

# Feature Distributions with respect to Churn
for column in df.columns:
    if column != 'Churn':
        plt.figure(figsize=(8, 6))
        sns.histplot(data=df, x=column, hue='Churn', multiple='stack', kde=True)
        plt.title(f'Distribution of {column} with respect to Churn')
        plt.show()
         CustomerID        Churn       Tenure  PreferredLoginDevice  \
count   5630.000000  5630.000000  5630.000000           5630.000000   
mean   52815.500000     0.168384    10.134103              0.928419   
std     1625.385339     0.374240     8.357951              0.709822   
min    50001.000000     0.000000     0.000000              0.000000   
25%    51408.250000     0.000000     3.000000              0.000000   
50%    52815.500000     0.000000     9.000000              1.000000   
75%    54222.750000     0.000000    15.000000              1.000000   
max    55630.000000     1.000000    61.000000              2.000000   

          CityTier  WarehouseToHome  PreferredPaymentMode       Gender  \
count  5630.000000      5630.000000           5630.000000  5630.000000   
mean      1.654707        15.566785              3.548135     0.601066   
std       0.915389         8.345961              1.389659     0.489723   
min       1.000000         5.000000              0.000000     0.000000   
25%       1.000000         9.000000              3.000000     0.000000   
50%       1.000000        14.000000              4.000000     1.000000   
75%       3.000000        20.000000              4.000000     1.000000   
max       3.000000       127.000000              6.000000     1.000000   

       HourSpendOnApp  NumberOfDeviceRegistered  PreferedOrderCat  \
count     5630.000000               5630.000000       5630.000000   
mean         2.934636                  3.688988          2.369627   
std          0.705528                  1.023999          1.411435   
min          0.000000                  1.000000          0.000000   
25%          2.000000                  3.000000          2.000000   
50%          3.000000                  4.000000          2.000000   
75%          3.000000                  4.000000          4.000000   
max          5.000000                  6.000000          5.000000   

       SatisfactionScore  MaritalStatus  NumberOfAddress     Complain  \
count        5630.000000    5630.000000      5630.000000  5630.000000   
mean            3.066785       1.168384         4.214032     0.284902   
std             1.380194       0.664344         2.583586     0.451408   
min             1.000000       0.000000         1.000000     0.000000   
25%             2.000000       1.000000         2.000000     0.000000   
50%             3.000000       1.000000         3.000000     0.000000   
75%             4.000000       2.000000         6.000000     1.000000   
max             5.000000       2.000000        22.000000     1.000000   

       OrderAmountHikeFromlastYear   CouponUsed   OrderCount  \
count                  5630.000000  5630.000000  5630.000000   
mean                     15.674600     1.716874     2.961812   
std                       3.591058     1.857640     2.879248   
min                      11.000000     0.000000     1.000000   
25%                      13.000000     1.000000     1.000000   
50%                      15.000000     1.000000     2.000000   
75%                      18.000000     2.000000     3.000000   
max                      26.000000    16.000000    16.000000   

       DaySinceLastOrder  CashbackAmount  
count        5630.000000     5630.000000  
mean            4.459325      177.223030  
std             3.570626       49.207036  
min             0.000000        0.000000  
25%             2.000000      145.770000  
50%             3.000000      163.280000  
75%             7.000000      196.392500  
max            46.000000      324.990000  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Correlation Matrix
plt.figure(figsize=(14, 12))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pair Plots
sns.pairplot(df, hue='Churn', diag_kind='kde')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Further EDA based on feature importance
import shap

# Data Splitting
X = df.drop('Churn', axis=1)
y = df['Churn']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
from sklearn.ensemble import RandomForestClassifier
best_model = RandomForestClassifier()
best_model.fit(X_train, y_train)
Out[ ]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [ ]:
# SHAP for Model Interpretation
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

# Feature Importance from Random Forest
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(12, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()
No description has been provided for this image
No description has been provided for this image